Starting from the filtered table from ‘HMP_coverage.Rmd’, run a series of analyses to look at relationships between body site and subjects.
# Print the current date and time so the rendered report records when it was run.
print(date())
## [1] "Mon Aug 31 14:21:31 2015"
library(reshape2)
#library(igraph)
library(dplyr)
##
## Attaching package: 'dplyr'
##
## The following objects are masked from 'package:stats':
##
## filter, lag
##
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
#library(biomod2)
library(e1071)
library(RColorBrewer)
library(gdata)
## gdata: read.xls support for 'XLS' (Excel 97-2004) files ENABLED.
##
## gdata: read.xls support for 'XLSX' (Excel 2007+) files ENABLED.
##
## Attaching package: 'gdata'
##
## The following objects are masked from 'package:dplyr':
##
## combine, first, last
##
## The following object is masked from 'package:stats':
##
## nobs
##
## The following object is masked from 'package:utils':
##
## object.size
library(vegan)
## Loading required package: permute
## Loading required package: lattice
## This is vegan 2.3-0
library(assertthat)
source('./staph_metagenome_tools.R', echo=TRUE)
##
## > bintr <- function(mat, cutoff) {
## + mat[which(mat > cutoff)] <- 1
## + mat[which(!(mat > cutoff))] <- 0
## + return(mat)
## + }
##
## > calc_FTS <- function(pop, mini) {
## + fishmat <- matrix(c(mini[1], mini[2], pop[1] - mini[1], pop[2] -
## + mini[2]), ncol = 2, nrow = 2)
## + .... [TRUNCATED]
##
## > calc_hits <- function(nameset, mat) {
## + minimat <- select(mat, one_of(nameset))[rownames(mat) %in%
## + nameset, ]
## + minimat.size <- ( .... [TRUNCATED]
##
## > calc_hits_slice <- function(nameset, mat) {
## + minimat <- slice(mat, nameset)[, nameset]
## + minimat.hits <- sum(minimat)/2
## + return(minima .... [TRUNCATED]
##
## > create_cooccur_mat <- function(mat) {
## + library(reshape2)
## + dat2 <- melt(mat)
## + w <- dcast(dat2, V2 ~ V1)
## + x <- as.matrix(w[, -1])
## .... [TRUNCATED]
##
## > genotypes_plot <- function(mat, tit) {
## + top_genos <- c("CC_30", "CC_8", "CC_45", "CC_398", "CC_133",
## + "CC_59", "CC_15", "CC_97", "CC_ ..." ... [TRUNCATED]
##
## > run_bs_subj_adonis <- function(df, bs_vec, subj_vec) {
## + library(e1071)
## + library(vegan)
## + body_site_adonis <- adonis(df ~ bs_vec)
## + .... [TRUNCATED]
##
## > make_subtype_matrix <- function(df) {
## + library(dplyr)
## + check_staph_df(df)
## + mat <- select(df, matches("CC")) %>% as.matrix
## + asser .... [TRUNCATED]
##
## > plot_coverages <- function(combined.df, titl) {
## + check_staph_df(combined.df)
## + par(mar = c(12, 4, 4, 2), cex = 0.8)
## + with(combined.df, .... [TRUNCATED]
##
## > plot_adjusted_coverages <- function(combined.df, titl) {
## + check_staph_df(combined.df)
## + stcols <- grep("CC|MLST", colnames(combined.df))
## + .... [TRUNCATED]
##
## > plot_mecA <- function(combined.df, titl) {
## + check_staph_df(combined.df)
## + with(combined.df, plot(Staph_cov, mecA_cov, col = Body.site,
## + .... [TRUNCATED]
##
## > plot_diversity_vers_cov <- function(combined.df, titl) {
## + library(vegan)
## + check_staph_df(combined.df)
## + stcols <- grep("CC|MLST", coln .... [TRUNCATED]
##
## > check_staph_df <- function(df) {
## + library(assertthat)
## + assert_that(length(grep("Body.site", colnames(df))) == 1)
## + assert_that(length( .... [TRUNCATED]
##
## > subject_perm <- function(df, multiSubjects, hamming_mat) {
## + library(gdata)
## + check_staph_df(df)
## + sub1.hits = 0
## + sub1.cells = 0
## + .... [TRUNCATED]
##
## > by_factor_perm <- function(bs, df, hamming_mat) {
## + check_staph_df(df)
## + for (i in bs) {
## + bss_rows <- which(df$Body.site == i)
## + .... [TRUNCATED]
##
## > intra_body_FTS <- function(body1, body2, df, multiSubjects,
## + u) {
## + library(dplyr)
## + check_staph_df(df)
## + temp.an <- filter(df, Bo .... [TRUNCATED]
##
## > merge_CCs <- function(in_data, CC) {
## + new_col <- select(in_data, matches(CC)) %>% rowSums()
## + in_data <- select(in_data, -(matches(CC)))
## + .... [TRUNCATED]
# Load the filtered coverage table.
# stringsAsFactors is set explicitly: TRUE was the read.table() default when
# this report was rendered (R 3.2.1), but since R 4.0.0 strings are read as
# character, which would make levels(dat4$Body.site) further down return NULL
# and silently skip every per-body-site loop.
dat4 <- read.table("./Data/cov0.025", stringsAsFactors = TRUE)
# List of all subjects with more than one sample
multiSubjects <- count(dat4, Subject.Id) %>%
  filter(n > 1) %>%
  select(Subject.Id)
# Subtype matrix (make_subtype_matrix() selects the columns matching "CC")
dat5 <- make_subtype_matrix(dat4)
dat4$Subject.Id <- as.factor(dat4$Subject.Id)
# Create Hamming distance matrices with and without a cutoff:
# dat6 is computed after binarising at 0.2 (values > 0.2 become 1, all others
# 0); dat8 is computed on the matrix as returned by make_subtype_matrix().
dat6 <- make_subtype_matrix(dat4) %>%
  bintr(0.2) %>%
  hamming.distance %>%
  data.frame
dat8 <- make_subtype_matrix(dat4) %>%
  hamming.distance %>%
  data.frame
Test for significant associations of subtype with body site and subject, using the Hamming distance matrix. Two levels: one with a beta cutoff of > 0.2 applied to all samples, and one without.
# Fix the RNG seed so the permutation-based tests below are reproducible.
set.seed(344098)
# Body-site and subject tests on dat6, the Hamming distance matrix built from
# the subtype matrix binarised at 0.2.
run_bs_subj_adonis(dat6,dat4$Body.site,dat4$Subject.Id)
##
## Call:
## adonis(formula = df ~ bs_vec)
##
## Permutation: free
## Number of permutations: 999
##
## Terms added sequentially (first to last)
##
## Df SumsOfSqs MeanSqs F.Model R2 Pr(>F)
## bs_vec 13 1.0189 0.078380 3.0876 0.11464 0.001 ***
## Residuals 310 7.8695 0.025385 0.88536
## Total 323 8.8884 1.00000
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Homogeneity of multivariate dispersions
##
## Call: betadisper(d = dist(df), group = bs_vec)
##
## No. of Positive Eigenvalues: 30
## No. of Negative Eigenvalues: 0
##
## Average distance to median:
## anterior nares attached keratinized gingiva
## 15.517 10.909
## buccal mucosa hard palate
## 14.161 0.000
## left retroauricular crease palatine tonsil
## 15.615 12.828
## posterior fornnix right antecubital fossa
## 9.607 0.000
## right retroauricular crease saliva
## 18.573 0.000
## stool subgingival_plaque
## 10.490 0.000
## supragingival plaque tongue dorsum
## 14.060 16.631
##
## Eigenvalues for PCoA axes:
## PCoA1 PCoA2 PCoA3 PCoA4 PCoA5 PCoA6 PCoA7
## 39060.323 33060.993 7785.027 3555.066 2349.365 1908.334 1347.009
## PCoA8
## 1007.032
## Df Sum Sq Mean Sq F N.Perm Pr(>F)
## Groups 13 2070.839 159.29527 5.116226 999 0.001
## Residuals 310 9651.945 31.13531 NA NA NA
##
## Call:
## adonis(formula = df ~ subj_vec)
##
## Permutation: free
## Number of permutations: 999
##
## Terms added sequentially (first to last)
##
## Df SumsOfSqs MeanSqs F.Model R2 Pr(>F)
## subj_vec 107 3.5185 0.032883 1.3227 0.39585 0.006 **
## Residuals 216 5.3699 0.024861 0.60415
## Total 323 8.8884 1.00000
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Homogeneity of multivariate dispersions
##
## Call: betadisper(d = dist(df), group = subj_vec)
##
## No. of Positive Eigenvalues: 30
## No. of Negative Eigenvalues: 0
##
## Average distance to median:
## 1 2 3 4 5 6 7
## 9.000e+00 1.175e+01 0.000e+00 0.000e+00 0.000e+00 0.000e+00 1.000e+01
## 8 9 10 11 12 13 15
## 2.500e+00 9.165e+00 2.382e+01 1.183e+01 1.549e+01 1.322e+01 1.731e+01
## 16 17 18 19 20 21 22
## 0.000e+00 0.000e+00 9.523e+00 0.000e+00 0.000e+00 9.000e+00 1.179e+01
## 23 24 25 26 27 28 29
## 0.000e+00 0.000e+00 1.481e+01 6.874e+00 1.171e+01 1.309e+01 1.337e+01
## 30 31 32 33 34 35 36
## 9.849e+00 1.005e+01 1.576e+01 1.487e+01 1.183e+01 1.275e+01 0.000e+00
## 37 38 39 40 41 42 43
## 1.675e+01 4.796e+00 1.553e+01 1.273e+01 1.500e+01 1.078e+01 1.123e+01
## 44 45 46 47 48 49 50
## 6.633e+00 0.000e+00 0.000e+00 0.000e+00 1.375e+01 0.000e+00 0.000e+00
## 51 52 53 54 55 56 57
## 0.000e+00 1.024e-13 0.000e+00 0.000e+00 1.315e+01 1.886e+01 1.183e+01
## 58 60 61 62 63 64 65
## 0.000e+00 1.506e+01 0.000e+00 0.000e+00 1.264e+01 0.000e+00 1.375e+01
## 66 67 68 69 70 71 72
## 1.108e+01 4.372e+00 1.385e+01 1.411e+01 1.042e+01 8.028e+00 1.697e+01
## 73 74 75 76 77 78 79
## 1.642e+01 6.620e+00 6.000e+00 7.700e+00 1.204e+01 8.448e+00 2.289e-14
## 80 81 82 83 84 85 86
## 1.119e+01 1.083e+01 5.672e-14 5.408e-14 2.981e+00 0.000e+00 1.706e+01
## 87 88 89 90 91 92 93
## 1.375e+01 0.000e+00 0.000e+00 0.000e+00 1.136e+01 1.153e+01 1.245e+01
## 94 95 96 97 98 99 100
## 1.892e+01 1.584e+01 0.000e+00 1.344e+01 0.000e+00 0.000e+00 1.170e+01
## 101 102 103 104 105 106 107
## 0.000e+00 1.054e+01 1.183e+01 0.000e+00 0.000e+00 1.310e+01 9.849e+00
## 108 109 110
## 1.375e+01 0.000e+00 9.000e+00
##
## Eigenvalues for PCoA axes:
## PCoA1 PCoA2 PCoA3 PCoA4 PCoA5 PCoA6 PCoA7
## 39060.323 33060.993 7785.027 3555.066 2349.365 1908.334 1347.009
## PCoA8
## 1007.032
## Df Sum Sq Mean Sq F N.Perm Pr(>F)
## Groups 107 9985.80 93.32523 1.109574 999 0.268
## Residuals 216 18167.57 84.10910 NA NA NA
##
## Call:
## adonis(formula = df ~ bs_vec + subj_vec)
##
## Permutation: free
## Number of permutations: 999
##
## Terms added sequentially (first to last)
##
## Df SumsOfSqs MeanSqs F.Model R2 Pr(>F)
## bs_vec 13 1.0189 0.078380 3.4193 0.11464 0.001 ***
## subj_vec 106 3.1931 0.030124 1.3141 0.35925 0.002 **
## Residuals 204 4.6763 0.022923 0.52612
## Total 323 8.8884 1.00000
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Same tests on dat8, the Hamming distance matrix built without the 0.2 cutoff.
run_bs_subj_adonis(dat8,dat4$Body.site,dat4$Subject.Id)
##
## Call:
## adonis(formula = df ~ bs_vec)
##
## Permutation: free
## Number of permutations: 999
##
## Terms added sequentially (first to last)
##
## Df SumsOfSqs MeanSqs F.Model R2 Pr(>F)
## bs_vec 13 2.6878 0.206751 13.113 0.35479 0.001 ***
## Residuals 310 4.8879 0.015768 0.64521
## Total 323 7.5757 1.00000
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Homogeneity of multivariate dispersions
##
## Call: betadisper(d = dist(df), group = bs_vec)
##
## No. of Positive Eigenvalues: 273
## No. of Negative Eigenvalues: 0
##
## Average distance to median:
## anterior nares attached keratinized gingiva
## 58.45 13.46
## buccal mucosa hard palate
## 15.84 0.00
## left retroauricular crease palatine tonsil
## 36.11 19.08
## posterior fornnix right antecubital fossa
## 11.96 0.00
## right retroauricular crease saliva
## 31.17 0.00
## stool subgingival_plaque
## 19.97 0.00
## supragingival plaque tongue dorsum
## 15.48 24.46
##
## Eigenvalues for PCoA axes:
## PCoA1 PCoA2 PCoA3 PCoA4 PCoA5 PCoA6
## 865836.235 21893.901 13653.453 6914.135 4028.043 3103.288
## PCoA7 PCoA8
## 2677.330 2261.559
## Df Sum Sq Mean Sq F N.Perm Pr(>F)
## Groups 13 86468.65 6651.435 5.335864 999 0.014
## Residuals 310 386431.25 1246.552 NA NA NA
##
## Call:
## adonis(formula = df ~ subj_vec)
##
## Permutation: free
## Number of permutations: 999
##
## Terms added sequentially (first to last)
##
## Df SumsOfSqs MeanSqs F.Model R2 Pr(>F)
## subj_vec 107 2.2062 0.020618 0.82941 0.29121 0.882
## Residuals 216 5.3696 0.024859 0.70879
## Total 323 7.5757 1.00000
##
## Homogeneity of multivariate dispersions
##
## Call: betadisper(d = dist(df), group = subj_vec)
##
## No. of Positive Eigenvalues: 273
## No. of Negative Eigenvalues: 0
##
## Average distance to median:
## 1 2 3 4 5 6 7
## 1.541e+01 2.778e+01 0.000e+00 0.000e+00 0.000e+00 0.000e+00 5.976e+01
## 8 9 10 11 12 13 15
## 8.180e+01 2.471e+01 4.921e+01 2.192e+01 5.427e+01 5.960e+01 2.740e+01
## 16 17 18 19 20 21 22
## 0.000e+00 0.000e+00 4.663e+01 0.000e+00 0.000e+00 1.135e+01 1.016e+01
## 23 24 25 26 27 28 29
## 0.000e+00 0.000e+00 4.027e+01 1.444e+01 3.149e+01 2.831e+01 2.061e+01
## 30 31 32 33 34 35 36
## 2.766e+01 1.792e+02 3.360e+01 1.774e+01 9.574e+01 3.948e+01 0.000e+00
## 37 38 39 40 41 42 43
## 5.495e+01 3.311e+01 4.151e+01 2.597e+01 3.241e+01 5.517e+01 1.314e+02
## 44 45 46 47 48 49 50
## 7.347e+01 0.000e+00 0.000e+00 0.000e+00 8.818e+00 0.000e+00 0.000e+00
## 51 52 53 54 55 56 57
## 0.000e+00 4.513e+01 0.000e+00 0.000e+00 1.988e+01 3.044e+01 2.963e+01
## 58 60 61 62 63 64 65
## 0.000e+00 2.056e+01 0.000e+00 0.000e+00 3.001e+01 0.000e+00 1.694e+01
## 66 67 68 69 70 71 72
## 1.798e+01 3.940e+01 1.273e+01 1.449e+01 1.887e+01 1.845e+01 1.971e+01
## 73 74 75 76 77 78 79
## 2.148e+01 1.391e+01 1.534e+01 2.338e+01 1.607e+01 8.278e+00 7.533e+00
## 80 81 82 83 84 85 86
## 1.499e+01 1.522e+01 1.299e-12 1.934e-12 2.676e+01 0.000e+00 1.325e+01
## 87 88 89 90 91 92 93
## 8.818e+00 0.000e+00 0.000e+00 0.000e+00 1.014e+01 3.759e+01 1.101e+01
## 94 95 96 97 98 99 100
## 2.321e+01 1.636e+01 0.000e+00 6.259e+01 0.000e+00 0.000e+00 1.375e+01
## 101 102 103 104 105 106 107
## 0.000e+00 2.099e+01 1.550e+01 0.000e+00 0.000e+00 2.239e+01 1.078e+01
## 108 109 110
## 8.818e+00 0.000e+00 1.507e+01
##
## Eigenvalues for PCoA axes:
## PCoA1 PCoA2 PCoA3 PCoA4 PCoA5 PCoA6
## 865836.235 21893.901 13653.453 6914.135 4028.043 3103.288
## PCoA7 PCoA8
## 2677.330 2261.559
## Df Sum Sq Mean Sq F N.Perm Pr(>F)
## Groups 107 193706.7 1810.343 1.195398 999 0.267
## Residuals 216 327116.3 1514.427 NA NA NA
##
## Call:
## adonis(formula = df ~ bs_vec + subj_vec)
##
## Permutation: free
## Number of permutations: 999
##
## Terms added sequentially (first to last)
##
## Df SumsOfSqs MeanSqs F.Model R2 Pr(>F)
## bs_vec 13 2.6878 0.206751 13.3252 0.35479 0.001 ***
## subj_vec 106 1.7227 0.016252 1.0474 0.22740 0.378
## Residuals 204 3.1652 0.015516 0.41781
## Total 323 7.5757 1.00000
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Test whether intra-subject distance is greater than inter-subject distance,
# using the subjects with more than one sample and the binarised (> 0.2)
# Hamming distance matrix.
subject_perm(dat4,multiSubjects,dat6)
## Score for intraperson hits = 1412
## [1] "Quartlies for random distribution"
## 0% 25% 50% 75% 100%
## 1465 1568 1587 1606 1702
## Empirical p value = 0
# Now run the same test between body sites.
# as.factor() leaves an existing factor untouched, and keeps levels() from
# returning NULL when Body.site was read as a character column (the
# read.table() default since R 4.0.0), which would skip every body site.
bs <- levels(as.factor(dat4$Body.site))
by_factor_perm(bs, dat4, dat6)
## [1] "anterior nares"
## [1] "Number of samples " "68"
## [1] "Distribution of random hits"
## 0% 25% 50% 75% 100%
## 4195 5065 5242 5424 6291
## [1] 5301
## Empirical p value [1] 0.5902
##
## [1] "attached keratinized gingiva"
## [1] "Number of samples " "4"
## [1] "Distribution of random hits"
## 0% 25% 50% 75% 100%
## 0 12 13 16 28
## [1] 8
## Empirical p value [1] 0.0544
##
## [1] "buccal mucosa"
## [1] "Number of samples " "56"
## [1] "Distribution of random hits"
## 0% 25% 50% 75% 100%
## 2837 3405 3543 3684 4282
## [1] 2941
## Empirical p value [1] 0.0015
##
## Zero samples in hard palate[1] "left retroauricular crease"
## [1] "Number of samples " "23"
## [1] "Distribution of random hits"
## 0% 25% 50% 75% 100%
## 370 546 582 620 790
## [1] 636
## Empirical p value [1] 0.8399
##
## [1] "palatine tonsil"
## [1] "Number of samples " "6"
## [1] "Distribution of random hits"
## 0% 25% 50% 75% 100%
## 5 29 34 39 63
## [1] 36
## Empirical p value [1] 0.612
##
## [1] "posterior fornnix"
## [1] "Number of samples " "9"
## [1] "Distribution of random hits"
## 0% 25% 50% 75% 100%
## 16 74 82 92 130
## [1] 48
## Empirical p value [1] 0.0075
##
## Zero samples in right antecubital fossa[1] "right retroauricular crease"
## [1] "Number of samples " "28"
## [1] "Distribution of random hits"
## 0% 25% 50% 75% 100%
## 612 817 869 921 1170
## [1] 1081
## Empirical p value [1] 0.9965
##
## Zero samples in saliva[1] "stool"
## [1] "Number of samples " "7"
## [1] "Distribution of random hits"
## 0% 25% 50% 75% 100%
## 6 42 48 54 84
## [1] 30
## Empirical p value [1] 0.031
##
## Zero samples in subgingival_plaque[1] "supragingival plaque"
## [1] "Number of samples " "37"
## [1] "Distribution of random hits"
## 0% 25% 50% 75% 100%
## 1066 1454 1530 1606 1958
## [1] 1164
## Empirical p value [1] 6e-04
##
## [1] "tongue dorsum"
## [1] "Number of samples " "82"
## [1] "Distribution of random hits"
## 0% 25% 50% 75% 100%
## 6280.00 7402.00 7640.00 7872.25 9019.00
## [1] 7815
## Empirical p value [1] 0.6947
# Binarise the subtype matrix at two thresholds: values above the cutoff
# become 1, everything else 0.
presence_mat <- as.data.frame(bintr(dat5, 0.2))
top_score_mat <- as.data.frame(bintr(dat5, 0.5))
# png("~/Dropbox/ARTICLES_BY_TDR/2015-staph-metagenome/HMP_barchart.png",width=640, height =640, res = 75)
# dev.off()

# Genotype plots over all samples, one per threshold.
genotypes_plot(presence_mat, "All samples, subtypes present > 0.2")
genotypes_plot(top_score_mat, "All samples, subtypes present > 0.5")

# Per-body-site plots at the 0.2 threshold; sites with no samples are skipped.
for (site in bs) {
  site_rows <- which(dat4$Body.site == site)
  if (length(site_rows) == 0) {
    next
  }
  genotypes_plot(slice(presence_mat, site_rows), paste("Present: ", site))
}

# Per-body-site plots at the 0.5 threshold; sites with no samples are skipped.
for (site in bs) {
  site_rows <- which(dat4$Body.site == site)
  if (length(site_rows) == 0) {
    next
  }
  genotypes_plot(slice(top_score_mat, site_rows), paste("Top score: ", site))
}
###PCA
# Draw the PCA panels on a 2 x 2 grid.
par(mfrow = c(2, 2))
# Principal components of dat6 (the binarised, > 0.2, Hamming distance matrix).
pcobj <- prcomp(dat6)
# Semi-transparent gray used for the samples that are not highlighted.
tr_gray <- rgb(0.5, 0.5, 0.5, 0.15)

# One panel per body site: that site's samples in red, all others in gray.
for (site in bs) {
  point_cols <- rep(tr_gray, nrow(dat6))
  point_cols[which(dat4$Body.site == site)] <- "red"
  plot(pcobj$x, col = point_cols, pch = 16, main = site)
}

# One panel per multi-sample subject with more than three samples: that
# subject's samples in blue, all others in gray.
for (subj in multiSubjects$Subject.Id) {
  subj_rows <- which(dat4$Subject.Id == as.character(subj))
  if (length(subj_rows) <= 3) {
    next
  }
  point_cols <- rep(tr_gray, nrow(dat6))
  point_cols[subj_rows] <- "blue"
  # NOTE(review): main is a length-2 vector rather than a pasted string;
  # confirm that this is the intended title format.
  plot(pcobj$x, col = point_cols, pch = 16, main = c("Subject", subj))
}
###Session Info
# Record the R version, platform and attached/loaded package versions used
# for this run.
sessionInfo()
## R version 3.2.1 (2015-06-18)
## Platform: x86_64-apple-darwin13.4.0 (64-bit)
## Running under: OS X 10.10.5 (Yosemite)
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] assertthat_0.1 vegan_2.3-0 lattice_0.20-33
## [4] permute_0.8-4 gdata_2.17.0 RColorBrewer_1.1-2
## [7] e1071_1.6-7 dplyr_0.4.2 reshape2_1.4.1
##
## loaded via a namespace (and not attached):
## [1] Rcpp_0.12.0 cluster_2.0.3 knitr_1.10.5 magrittr_1.5
## [5] MASS_7.3-43 R6_2.1.0 stringr_1.0.0 plyr_1.8.3
## [9] tools_3.2.1 parallel_3.2.1 grid_3.2.1 nlme_3.1-121
## [13] mgcv_1.8-7 DBI_0.3.1 htmltools_0.2.6 class_7.3-13
## [17] gtools_3.5.0 lazyeval_0.1.10 yaml_2.1.13 digest_0.6.8
## [21] Matrix_1.2-2 evaluate_0.7 rmarkdown_0.7 stringi_0.5-5